Executive Summary:

Selecting the Study

Parsing DataMat

# file parsing and cleaning
d <- read.delim("/Users/paulcao/Downloads/GSE143290_SI_study_RawCounts.txt", header = TRUE,
    sep = "\t")
dataMat <- d[, c(8, 9, 10, 11, 12, 13)]
rownames(dataMat) <- d$Geneid
colnames(dataMat) <- c("VAS_Control_1", "VAS_Control_2", "VAS_Control_3", "VAS_Infected_1",
    "VAS_Infected_2", "VAS_Infected_3")
head(dataMat)
##        VAS_Control_1 VAS_Control_2 VAS_Control_3 VAS_Infected_1 VAS_Infected_2
## Xkr4               0             3             5              2              4
## Rp1                0             0             2              0              0
## Sox17             31            63            34             89             34
## Mrpl15          2081          1767          2733           1624           1963
## Lypla1          7946          7399          7717           7515           6790
## Tcea1           2885          2237          2808           2003           2558
##        VAS_Infected_3
## Xkr4                5
## Rp1                 2
## Sox17              35
## Mrpl15           1671
## Lypla1           6798
## Tcea1            2305

Gene of Interest Selection

Power analysis

RNAseqsamplesize [@Zhao2018] was used to do the analysis. With expected fold change between groups = 2, FDR set = 0.01 and number of samples = 5, power was computed to be 0.086 for the selected genes of interest which is very low. Some gene IDs pulled from Biomart are not in the dataset because they are alleles on alternative sequences (i.e. rapsn and Fbxo32) Dispersion was also computed as 0.04754.

# estimate gene read count and dispersion distribution
distribution <- est_count_dispersion(dataMat, group = c(0, 0, 0, 1, 1, 1))
## Disp = 0.05714 , BCV = 0.239
Gene.name <- c("Trex1", "Ifrd1", "Ddost", "Rpn1", "Stt3b", "Krtcap2", "Tmem258",
    "Ostc", "Rnaseh2a", "Dad1", "Tusc3", "Stt3a", "Ankmy2", "Mef2c", "Ifnl2", "Sap30",
    "Fau", "Gm9843", "Dixdc1", "Bhlhb9", "Hdac1", "Sin3b")
Gene.stable.ID <- sprintf("Gene % d", 1:22)

genelist <- data.frame(Gene.name, Gene.stable.ID)

power <- est_power_distribution(n = 5, f = 0.01, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
mean(power$power)  # 0.217285
## [1] 0.217285

From changing around FDR (fdr parameter in est_power_curve()) and coverage (lambda0 parameter in est_power_curve()) as well as making an optimization plot, it appears that at least 10 samples are needed with an average coverage > 10 reads/gene depending on desired FDR in order to achieve >80% power. To be precise, 10 samples with 10 coverage and FDR=0.05 will give 82% power. To be safe, for the genes of interest we would need at least 13 samples to achieve 80% power.

Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.

Red line is FDR=0.01, coverage=5. Blue line is FDR=0.05, coverage=5. Purple line is FDR=0.01, coverage=10. Green line is FDR=0.05, coverage=10. Yellow line is FDR=0.05, coverage=20.

est_power_curve

Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.

Blue to brown gradient shows power from 0 to 1. Here FDR=0.01.

Iterating and calculating powers through different sample-size (n):

est_power(n = 8, lambda0 = 20, phi0 = 0.07154, f = 0.05, m = 52000, m1 = 71)
## [1] 0.51
power10 <- est_power_distribution(n = 10, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power10$power)
## [1] 0.8668957
power13 <- est_power_distribution(n = 13, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power13$power)
## [1] 0.9587752
power15 <- est_power_distribution(n = 15, f = 0.05, rho = 2, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power15$power)
## [1] 0.9817761
power18 <- est_power_distribution(n = 18, f = 0.05, rho = 2, minAveCount = 15, distributionObject = distribution,
    selectedGenes = genelist$Gene.name, storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
mean(power18$power)
## [1] 0.9932155

Extract the mean gene read count across all samples:

gene_readcounts <- distribution$pseudo.counts.mean[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]
gene_dispersions <- distribution$tagwise.dispersion[which(names(distribution$pseudo.counts.mean) %in% genelist$Gene.name)]

gene_readcounts
##     Ankmy2      Ifrd1      Mef2c       Dad1        Fau    Tmem258    Krtcap2 
##  921.06499 1856.91929  225.89718 2459.88159 9697.99371 1441.02956  980.94882 
##       Ostc      Hdac1      Ddost       Rpn1      Tusc3      Sap30      Sin3b 
## 4170.09639 2309.83045 6186.40215 5731.93796  422.96087  253.98447 1791.29028 
##   Rnaseh2a      Stt3a     Dixdc1      Trex1      Stt3b     Bhlhb9 
##  451.07636 4412.36483   71.54271  333.20858 5433.68299   80.60489
mean(gene_readcounts)
## [1] 2461.636
mean(gene_dispersions)
## [1] 0.05031733
ggplot(data=data.frame(counts=as.numeric(gene_readcounts),name=as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)]))) + geom_bar(aes(x=reorder(name,-counts),y=counts),stat="identity") + scale_y_log10() + coord_flip() + xlab("Gene name") + ylab("Mean count per gene across samples") + theme(axis.text.y = element_text(size=7)) + ggtitle("Mean count per gene across samples from experimental data") + labs(caption="Each sample has approximately 50 million reads. Experiment ID GSE58669")

gene_readcounts
##     Ankmy2      Ifrd1      Mef2c       Dad1        Fau    Tmem258    Krtcap2 
##  921.06499 1856.91929  225.89718 2459.88159 9697.99371 1441.02956  980.94882 
##       Ostc      Hdac1      Ddost       Rpn1      Tusc3      Sap30      Sin3b 
## 4170.09639 2309.83045 6186.40215 5731.93796  422.96087  253.98447 1791.29028 
##   Rnaseh2a      Stt3a     Dixdc1      Trex1      Stt3b     Bhlhb9 
##  451.07636 4412.36483   71.54271  333.20858 5433.68299   80.60489

Sample_Size_Distribution (genelist of all Cadherin and Integrin Genes)

ss<-sample_size_distribution(f=0.05,distributionObject=distribution,selectedGenes=genelist$Gene.name,storeProcess = TRUE)
## Warning in selecteGeneByName(selectedGenes, distributionObject): 2 selectedGenes were not found in distributionObject, discard them for further analysis
ss
## $iter
## [1] 7
## 
## $f.root
## [1] 0.05291029
## 
## $root
## [1] 10
## 
## $process
##       N       Power
## [1,]  1 0.007816436
## [2,]  9 0.791027221
## [3,] 10 0.852910286
## [4,] 11 0.897237676
## [5,] 13 0.953507915
## [6,] 17 0.989958990
## [7,] 33 0.996009604
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 5e-02 1e+00 2e+00 1e+00
names <- as.character(genelist$Gene.name[genelist$Gene.name %in% names(gene_readcounts)])
to_drop <- names[which(gene_readcounts < 10)]
ss_drop10 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=10)),storeProcess = TRUE)
# 76.7% power with N=9, 83.1% power with N=10
ss_drop10
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04509534
## 
## $root
## [1] 9
## 
## $process
##       N     Power
## [1,]  1 0.0185601
## [2,]  5 0.4384431
## [3,]  7 0.6838682
## [4,]  8 0.7798710
## [5,]  9 0.8450953
## [6,] 17 0.9928774
## [7,] 33 0.9960096
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

Sample_Size_Distribution (genelist of all genes with read counts >= 80)

to_drop80 <- names[which(gene_readcounts < 80)]
ss_drop80 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
# basically when rho goes up and genes go down it works
# so what exactly is rho?
# basically effect size... so question: do we expect it to be bigger or smaller than in the 2 group situation?
ss_drop80
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04680084
## 
## $root
## [1] 9
## 
## $process
##       N      Power
## [1,]  1 0.01864329
## [2,]  5 0.44028725
## [3,]  7 0.68598219
## [4,]  8 0.78185105
## [5,]  9 0.84680084
## [6,] 17 0.99300038
## [7,] 33 0.99600921
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

Sample_Size_Distribution (genelist of all genes with read counts >= 200)

to_drop200 <- names[which(gene_readcounts < 200)]
ss_drop200 <- sample_size_distribution(f=0.1,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=200)),storeProcess = TRUE)

Sample_Size_Distribution (genelist of all genes with read counts >= 80 and minimum fold change >=2.5)

ss_drop80_fold2.5 <- sample_size_distribution(f=0.1,rho=2.5,distributionObject=distribution,selectedGenes=names(which(gene_readcounts>=80)),storeProcess = TRUE)
ss_drop80_fold2.5
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04128189
## 
## $root
## [1] 5
## 
## $process
##       N      Power
## [1,]  1 0.05840905
## [2,]  3 0.49249287
## [3,]  4 0.70251460
## [4,]  5 0.84128189
## [5,]  9 0.99056980
## [6,] 17 0.99601169
## [7,] 33 0.99600857
## 
## $parameters
##   power       m      m1     fdr       w     rho       k 
##     0.8 10000.0   100.0     0.1     1.0     2.5     1.0

Sample_Size_Distribution (genelist of all genes that start with ‘CAH’ and ‘ITGA’; a more restrictive list of Cadherin and Integrin without pseudogenes)

kept <- append(rownames(genelist[grepl("Trex[0-9]+$", genelist$Gene.name), ]), rownames(genelist[grepl("Ifrd[0-9]+$", genelist$Gene.name), ]))
genelist[kept,]$Gene.name[which(!genelist[kept,]$Gene.stable.ID %in% names(distribution$pseudo.counts.mean))]
## [1] "Trex1" "Ifrd1"
kept_final <- kept[which(genelist[kept,]$Gene.name %in% names(distribution$pseudo.counts.mean))]
power_kept <- est_power_distribution(n=6,f=0.1,m=52000,m1=4000,distributionObject=distribution,selectedGenes=c("Trex1", "Ifrd1"),storeProcess = TRUE)
mean(power_kept$power) #32% power with just kept genes and 6 samples each group
## [1] 0.8259586
ss_kept <- sample_size_distribution(f=0.1,rho=2,distributionObject=distribution,selectedGenes=c("Trex1", "Ifrd1"),storeProcess=TRUE)
ss_kept
## $iter
## [1] 7
## 
## $f.root
## [1] 0.04720392
## 
## $root
## [1] 9
## 
## $process
##       N      Power
## [1,]  1 0.01867239
## [2,]  5 0.44064713
## [3,]  7 0.68643015
## [4,]  8 0.78229602
## [5,]  9 0.84720392
## [6,] 17 0.99304494
## [7,] 33 0.99600599
## 
## $parameters
## power     m    m1   fdr     w   rho     k 
## 8e-01 1e+04 1e+02 1e-01 1e+00 2e+00 1e+00

List all of the Found Genes in the RNASeq expression data(not just the genelist) [45 Genes]

found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
nrow(found)
## [1] 20
found <- genelist[which(genelist$Gene.name %in% names(distribution$pseudo.counts.mean)),]
found
##    Gene.name Gene.stable.ID
## 1      Trex1        Gene  1
## 2      Ifrd1        Gene  2
## 3      Ddost        Gene  3
## 4       Rpn1        Gene  4
## 5      Stt3b        Gene  5
## 6    Krtcap2        Gene  6
## 7    Tmem258        Gene  7
## 8       Ostc        Gene  8
## 9   Rnaseh2a        Gene  9
## 10      Dad1       Gene  10
## 11     Tusc3       Gene  11
## 12     Stt3a       Gene  12
## 13    Ankmy2       Gene  13
## 14     Mef2c       Gene  14
## 16     Sap30       Gene  16
## 17       Fau       Gene  17
## 19    Dixdc1       Gene  19
## 20    Bhlhb9       Gene  20
## 21     Hdac1       Gene  21
## 22     Sin3b       Gene  22
n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=found$Gene.name,storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=genelist[kept_final,]$Gene.name,storeProcess=TRUE)$power)

# Trex1:
musk_id <- which(found$Gene.name=='Trex1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])

# Ifrd1:
ifrd1_id <- which(found$Gene.name=='Ifrd1')
ifrd1_power <- sapply(gene_powers, function(x) x[ifrd1_id])

total_power <- sapply(gene_powers, function(x) mean(x))

total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))

# Number of genes vs number of samples at power=0.8
d <- data.frame(n,num_genes)
ggplot(data=d,aes(x=n,y=num_genes,label=num_genes)) + geom_line() + geom_point() + theme_bw() + ylab("Number of genes with power > 0.8") + xlab("Number of samples in each group") + ggtitle("Number of genes of interest with sufficient power to detect\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2") + geom_text(data=subset(d, n>7),vjust=0,nudge_y=1) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,70,10))

# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))

# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,musk_power,ifrd1_power,total_power,gene_powers_random) %>% tidyr::gather("type","power",musk_power,ifrd1_power,total_power,gene_powers_random)
ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for CDH1 gene and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("musk_power","ifrd1_power","total_power","gene_powers_random"),labels=c("Trex1","Ifrd1","Total for interested genes","Random genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))

avg_count <- transmute(dataMat,m1=(Innervated_WT_1+Innervated_WT_2)/2,m2=(Denervated_WT_1+Denervated_WT_2)/2)
log2_fold <- log2(avg_count$m2/avg_count$m1)>2
total_de<-sum(log2_fold[!is.na(log2_fold)]) # about 4000
library(PROPER)
ngenes = nrow(dataMat)
simOptions = RNAseq.SimOptions.2grp(ngenes=ngenes,lBaselineExpr="bottomly",lOD="bottomly")
simres = runSims(sim.opts=simOptions,nsims=20)
powers = comparePower(simres, alpha.type='fdr', alpha.nominal=0.1, stratify.by='expr')

Fitting a Negative Binomial Distribution and Binning Genes into Highly, Medium and Lowly Expressed Genes

library(fitdistrplus)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:biomaRt':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: survival
samples <- as.numeric(array(distribution$pseudo.counts.mean))
samples <-  as.integer(samples)
negbinom <- fitdist(samples, "nbinom")
summary(negbinom)
## Fitting of the distribution ' nbinom ' by maximum likelihood 
## Parameters : 
##          estimate   Std. Error
## size    0.3214267  0.002736276
## mu   1401.5124212 21.297744213
## Loglikelihood:  -133454.8   AIC:  266913.6   BIC:  266929.1 
## Correlation matrix:
##              size           mu
## size 1.000000e+00 7.759517e-05
## mu   7.759517e-05 1.000000e+00
quantile(negbinom)
## Estimated quantiles for each specified probability (non-censored data)
##          p=0.1 p=0.2 p=0.3 p=0.4 p=0.5 p=0.6 p=0.7 p=0.8 p=0.9
## estimate     2    21    74   184   381   709  1250  2184  4098
plot(negbinom)

Binning the Genes into their expression level

Based on the quantiles as calculated by the fitted negative binomial distribution, we can bin the genes of interest into the following three bins:

Highly expressed genes (p >0.7 or gene count > 1250): c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc")

Medium expressioned genes (0.4<p<0.7) c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3")

Lowly expressioned genes (p<0.4): c("Dixdc1", "Bhlhb9") plus some randomly selected genes "c(Sox17", "Xkr4", "A830018L16Rik") to get to a total of 5 genes.

Plotting Power by Highly, Medium and Lowly Expressed Genes

n <- 1:15
gene_powers <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers,function(x) sum(x>0.8))
gene_powers_keptfinal <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Fau", "Dad1", "Sin3b", "Rpn1", "Ddost", "Ostc"),storeProcess=TRUE)$power)


gene_powers2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, maxAveCount = 50000, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)
num_genes <- sapply(gene_powers2,function(x) sum(x>0.8))
gene_powers_keptfinal2 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Trex1", "Rnaseh2a", "Mef2c", "Sap30", "Tusc3"),storeProcess=TRUE)$power)


gene_powers3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Dixdc1", "Bhlhb9", "Sox17", "Xkr4", "A830018L16Rik"), storeProcess=TRUE)$power)
#num_genes <- sapply(gene_powers3,function(x) sum(x>0.8))
#gene_powers_keptfinal3 <- lapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000,selectedGenes=c("Dixdc1", "Bhlhb9"),storeProcess=TRUE)$power)

# Trex1:
musk_id <- which(found$Gene.name=='Trex1')
musk_power <- sapply(gene_powers, function(x) x[musk_id])

# Ifrd1:
ifrd1_id <- which(found$Gene.name=='Ifrd1')
ifrd1_power <- sapply(gene_powers, function(x) x[ifrd1_id])

total_power_high <- sapply(gene_powers, function(x) mean(x))
total_power_medium <- sapply(gene_powers2, function(x) mean(x))
total_power_low <- sapply(gene_powers3, function(x) mean(x))


total_keptfinal <- sapply(gene_powers_keptfinal, function(x) mean(x))

# Gene powers random
gene_powers_random <- sapply(n, function(x) est_power_distribution(n=x, f=0.1, rho=2, distributionObject=distribution, m=52000,m1=4000))

# Power to detect musk and total power vs number of samples
d2 <- data.frame(n,total_power_high,total_power_medium,total_power_low,gene_powers_random) %>% tidyr::gather("type","power", total_power_high,total_power_medium,total_power_low,gene_powers_random)

ggplot(data=d2,aes(x=n)) + geom_line(aes(y=power,color=type)) + geom_point(aes(y=power,color=type)) + theme_bw() + ylab("Power") + xlab("Number of samples in each group") + ggtitle("Power for Highly, Medium and Lowly Expressed genes and total power\n as number of samples increases") + labs(caption="FDR=0.1,log fold change=2,number of random genes=100") + geom_hline(yintercept=0.8,linetype='dashed') + scale_color_discrete(name="Power",breaks=c("total_power_high","total_power_medium","total_power_low","gene_powers_random"),labels=c("Highly Expressed Genes", "Medium Expressed Genes","Lowly Expressed Genes","Random Genes")) + scale_x_continuous(breaks=seq(1,15,2)) + scale_y_continuous(breaks=seq(0,1,0.2))

References